{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Hello Loopy: Computing a Rank-One Matrix\n",
    "\n",
    "## Setup Code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pyopencl as cl\n",
    "import pyopencl.array\n",
    "import pyopencl.clrandom\n",
    "import loopy as lp\n",
    "\n",
    "from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Choose platform:\n",
      "[0] <pyopencl.Platform 'Portable Computing Language' at 0x7f13b1ef27a8>\n",
      "[1] <pyopencl.Platform 'Intel(R) OpenCL' at 0x359a118>\n"
     ]
    },
    {
     "name": "stdin",
     "output_type": "stream",
     "text": [
      "Choice [0]: 0\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Set the environment variable PYOPENCL_CTX='0' to avoid being asked again.\n"
     ]
    }
   ],
   "source": [
    "ctx = cl.create_some_context(interactive=True)\n",
    "queue = cl.CommandQueue(ctx)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "n = 1024\n",
    "a = cl.clrandom.rand(queue, n, dtype=np.float32)\n",
    "b = cl.clrandom.rand(queue, n, dtype=np.float32)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## The Initial Kernel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "knl = lp.make_kernel(\n",
    "    \"{[i,j]: 0<=i,j<n}\",\n",
    "    \"c[i, j] = a[i]*b[j]\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine lid(N) ((int) get_local_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine gid(N) ((int) get_group_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\n",
      "__kernel \u001b[36mvoid\u001b[39;49;00m __attribute__ ((reqd_work_group_size(\u001b[34m1\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m))) loopy_kernel(__global \u001b[36mfloat\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m *__restrict__ a, __global \u001b[36mfloat\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m *__restrict__ b, __global \u001b[36mfloat\u001b[39;49;00m *__restrict__ c, \u001b[36mint\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m n)\n",
      "{\n",
      "  \u001b[34mfor\u001b[39;49;00m (\u001b[36mint\u001b[39;49;00m j = \u001b[34m0\u001b[39;49;00m; j <= \u001b[34m-1\u001b[39;49;00m + n; ++j)\n",
      "    \u001b[34mfor\u001b[39;49;00m (\u001b[36mint\u001b[39;49;00m i = \u001b[34m0\u001b[39;49;00m; i <= \u001b[34m-1\u001b[39;49;00m + n; ++i)\n",
      "      c[n * i + j] = a[i] * b[j];\n",
      "}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "knl = lp.set_options(knl, write_cl=True)\n",
    "evt, (mat,) = knl(queue, a=a, b=b)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Transforming kernels: Loop Splitting\n",
    "\n",
    "Next: transform kernel. Example: Split a loop into fixed-length \"chunks\"."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine lid(N) ((int) get_local_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine gid(N) ((int) get_group_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine LOOPY_CALL_WITH_INTEGER_TYPES(MACRO_NAME) \\\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m    MACRO_NAME(int8, char) \\\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m    MACRO_NAME(int16, short) \\\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m    MACRO_NAME(int32, int) \\\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m    MACRO_NAME(int64, long)\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine LOOPY_DEFINE_FLOOR_DIV_POS_B(SUFFIX, TYPE) \\\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m    inline TYPE loopy_floor_div_pos_b_##SUFFIX(TYPE a, TYPE b) \\\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m    { \\\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m        if (a<0) \\\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m            a = a - (b-1); \\\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m        return a\u001b[39;49;00m\u001b[36m/\u001b[39;49;00m\u001b[36mb; \\\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m    }\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_FLOOR_DIV_POS_B)\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mundef LOOPY_DEFINE_FLOOR_DIV_POS_B\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mundef LOOPY_CALL_WITH_INTEGER_TYPES\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\n",
      "__kernel \u001b[36mvoid\u001b[39;49;00m __attribute__ ((reqd_work_group_size(\u001b[34m1\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m))) loopy_kernel(__global \u001b[36mfloat\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m *__restrict__ a, __global \u001b[36mfloat\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m *__restrict__ b, __global \u001b[36mfloat\u001b[39;49;00m *__restrict__ c, \u001b[36mint\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m n)\n",
      "{\n",
      "  \u001b[34mfor\u001b[39;49;00m (\u001b[36mint\u001b[39;49;00m j = \u001b[34m0\u001b[39;49;00m; j <= \u001b[34m-1\u001b[39;49;00m + n; ++j)\n",
      "    \u001b[34mfor\u001b[39;49;00m (\u001b[36mint\u001b[39;49;00m i_outer = \u001b[34m0\u001b[39;49;00m; i_outer <= \u001b[34m-1\u001b[39;49;00m + loopy_floor_div_pos_b_int32(\u001b[34m3\u001b[39;49;00m + n, \u001b[34m4\u001b[39;49;00m); ++i_outer)\n",
      "      \u001b[34mfor\u001b[39;49;00m (\u001b[36mint\u001b[39;49;00m i_inner = \u001b[34m0\u001b[39;49;00m; i_inner <= (\u001b[34m-4\u001b[39;49;00m + n + \u001b[34m-4\u001b[39;49;00m * i_outer >= \u001b[34m0\u001b[39;49;00m ? \u001b[34m3\u001b[39;49;00m : \u001b[34m-1\u001b[39;49;00m + n + \u001b[34m-4\u001b[39;49;00m * i_outer); ++i_inner)\n",
      "        c[n * (\u001b[34m4\u001b[39;49;00m * i_outer + i_inner) + j] = a[\u001b[34m4\u001b[39;49;00m * i_outer + i_inner] * b[j];\n",
      "}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "isplit_knl = knl\n",
    "isplit_knl = lp.split_iname(isplit_knl, \"i\", 4)\n",
    "\n",
    "evt, (mat,) = isplit_knl(queue, a=a, b=b)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Want to get rid of the conditional?"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Transforming kernels: Implementation Tags\n",
    "\n",
    "Every loop axis (\"iname\") comes with an *implementation tag*."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine lid(N) ((int) get_local_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine gid(N) ((int) get_group_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine LOOPY_CALL_WITH_INTEGER_TYPES(MACRO_NAME) \\\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m    MACRO_NAME(int8, char) \\\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m    MACRO_NAME(int16, short) \\\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m    MACRO_NAME(int32, int) \\\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m    MACRO_NAME(int64, long)\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine LOOPY_DEFINE_FLOOR_DIV_POS_B(SUFFIX, TYPE) \\\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m    inline TYPE loopy_floor_div_pos_b_##SUFFIX(TYPE a, TYPE b) \\\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m    { \\\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m        if (a<0) \\\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m            a = a - (b-1); \\\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m        return a\u001b[39;49;00m\u001b[36m/\u001b[39;49;00m\u001b[36mb; \\\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m    }\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_FLOOR_DIV_POS_B)\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mundef LOOPY_DEFINE_FLOOR_DIV_POS_B\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mundef LOOPY_CALL_WITH_INTEGER_TYPES\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\n",
      "__kernel \u001b[36mvoid\u001b[39;49;00m __attribute__ ((reqd_work_group_size(\u001b[34m1\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m))) loopy_kernel(__global \u001b[36mfloat\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m *__restrict__ a, __global \u001b[36mfloat\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m *__restrict__ b, __global \u001b[36mfloat\u001b[39;49;00m *__restrict__ c, \u001b[36mint\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m n)\n",
      "{\n",
      "  \u001b[34mfor\u001b[39;49;00m (\u001b[36mint\u001b[39;49;00m j = \u001b[34m0\u001b[39;49;00m; j <= \u001b[34m-1\u001b[39;49;00m + n; ++j)\n",
      "    \u001b[34mfor\u001b[39;49;00m (\u001b[36mint\u001b[39;49;00m i_outer = \u001b[34m0\u001b[39;49;00m; i_outer <= loopy_floor_div_pos_b_int32(\u001b[34m-4\u001b[39;49;00m + n, \u001b[34m4\u001b[39;49;00m); ++i_outer)\n",
      "    {\n",
      "      c[n * \u001b[34m4\u001b[39;49;00m * i_outer + j] = a[\u001b[34m4\u001b[39;49;00m * i_outer] * b[j];\n",
      "      c[n * (\u001b[34m4\u001b[39;49;00m * i_outer + \u001b[34m1\u001b[39;49;00m) + j] = a[\u001b[34m1\u001b[39;49;00m + \u001b[34m4\u001b[39;49;00m * i_outer] * b[j];\n",
      "      c[n * (\u001b[34m4\u001b[39;49;00m * i_outer + \u001b[34m2\u001b[39;49;00m) + j] = a[\u001b[34m2\u001b[39;49;00m + \u001b[34m4\u001b[39;49;00m * i_outer] * b[j];\n",
      "      c[n * (\u001b[34m4\u001b[39;49;00m * i_outer + \u001b[34m3\u001b[39;49;00m) + j] = a[\u001b[34m3\u001b[39;49;00m + \u001b[34m4\u001b[39;49;00m * i_outer] * b[j];\n",
      "    }\n",
      "}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "isplit_knl = knl\n",
    "isplit_knl = lp.assume(isplit_knl, \"n mod 4 = 0\")\n",
    "isplit_knl = lp.split_iname(isplit_knl, \"i\", 4)\n",
    "isplit_knl = lp.tag_inames(isplit_knl, {\"i_inner\": \"unr\"})\n",
    "\n",
    "evt, (mat,) = isplit_knl(queue, a=a, b=b)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "May want to influence loop ordering."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "----\n",
    "\"Map to GPU hw axis\" is an iname tag as well.\n",
    "\n",
    "Use shortcuts for less typing:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine lid(N) ((int) get_local_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine gid(N) ((int) get_group_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\n",
      "__kernel \u001b[36mvoid\u001b[39;49;00m __attribute__ ((reqd_work_group_size(\u001b[34m16\u001b[39;49;00m, \u001b[34m16\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m))) loopy_kernel(__global \u001b[36mfloat\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m *__restrict__ a, __global \u001b[36mfloat\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m *__restrict__ b, __global \u001b[36mfloat\u001b[39;49;00m *__restrict__ c, \u001b[36mint\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m n)\n",
      "{\n",
      "  \u001b[34mif\u001b[39;49;00m (\u001b[34m-1\u001b[39;49;00m + \u001b[34m-16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) + \u001b[34m-1\u001b[39;49;00m * lid(\u001b[34m1\u001b[39;49;00m) + n >= \u001b[34m0\u001b[39;49;00m && \u001b[34m-1\u001b[39;49;00m + \u001b[34m-16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + \u001b[34m-1\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m) + n >= \u001b[34m0\u001b[39;49;00m)\n",
      "    c[n * (\u001b[34m16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)) + \u001b[34m16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) + lid(\u001b[34m1\u001b[39;49;00m)] = a[\u001b[34m16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)] * b[\u001b[34m16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) + lid(\u001b[34m1\u001b[39;49;00m)];\n",
      "}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "split_knl = knl\n",
    "split_knl = lp.split_iname(split_knl, \"i\", 16,\n",
    "        outer_tag=\"g.0\", inner_tag=\"l.0\")\n",
    "split_knl = lp.split_iname(split_knl, \"j\", 16,\n",
    "        outer_tag=\"g.1\", inner_tag=\"l.1\")\n",
    "\n",
    "evt, (mat,) = split_knl(queue, a=a, b=b)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Transforming kernels: Leveraging data reuse\n",
    "\n",
    "Better! But still not much data reuse."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine lid(N) ((int) get_local_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine gid(N) ((int) get_group_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\n",
      "__kernel \u001b[36mvoid\u001b[39;49;00m __attribute__ ((reqd_work_group_size(\u001b[34m1\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m))) loopy_kernel(__global \u001b[36mfloat\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m *__restrict__ a, __global \u001b[36mfloat\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m *__restrict__ b, __global \u001b[36mfloat\u001b[39;49;00m *__restrict__ c, \u001b[36mint\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m n)\n",
      "{\n",
      "  \u001b[36mfloat\u001b[39;49;00m a_fetch;\n",
      "  \u001b[36mfloat\u001b[39;49;00m b_fetch;\n",
      "\n",
      "  \u001b[34mfor\u001b[39;49;00m (\u001b[36mint\u001b[39;49;00m i = \u001b[34m0\u001b[39;49;00m; i <= \u001b[34m-1\u001b[39;49;00m + n; ++i)\n",
      "  {\n",
      "    a_fetch = a[i];\n",
      "    \u001b[34mfor\u001b[39;49;00m (\u001b[36mint\u001b[39;49;00m j = \u001b[34m0\u001b[39;49;00m; j <= \u001b[34m-1\u001b[39;49;00m + n; ++j)\n",
      "    {\n",
      "      b_fetch = b[j];\n",
      "      c[n * i + j] = a_fetch * b_fetch;\n",
      "    }\n",
      "  }\n",
      "}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "fetch1_knl = knl\n",
    "\n",
    "fetch1_knl = lp.add_prefetch(fetch1_knl, \"a\", fetch_outer_inames=\"i\")\n",
    "fetch1_knl = lp.add_prefetch(fetch1_knl, \"b\", fetch_outer_inames=\"i,j\")\n",
    "\n",
    "evt, (mat,) = fetch1_knl(queue, a=a, b=b)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "But this is useless for the GPU version. (demo)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---\n",
    "\n",
    "Would like to fetch entire \"access footprint\" of a loop."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine lid(N) ((int) get_local_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine gid(N) ((int) get_group_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\n",
      "__kernel \u001b[36mvoid\u001b[39;49;00m __attribute__ ((reqd_work_group_size(\u001b[34m16\u001b[39;49;00m, \u001b[34m16\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m))) loopy_kernel(__global \u001b[36mfloat\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m *__restrict__ a, __global \u001b[36mfloat\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m *__restrict__ b, __global \u001b[36mfloat\u001b[39;49;00m *__restrict__ c, \u001b[36mint\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m n)\n",
      "{\n",
      "  \u001b[36mfloat\u001b[39;49;00m a_fetch[\u001b[34m16\u001b[39;49;00m];\n",
      "  \u001b[36mfloat\u001b[39;49;00m b_fetch[\u001b[34m16\u001b[39;49;00m];\n",
      "\n",
      "  \u001b[34mif\u001b[39;49;00m (\u001b[34m-1\u001b[39;49;00m + \u001b[34m-16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) + \u001b[34m-1\u001b[39;49;00m * lid(\u001b[34m1\u001b[39;49;00m) + n >= \u001b[34m0\u001b[39;49;00m)\n",
      "  {\n",
      "    \u001b[34mif\u001b[39;49;00m (\u001b[34m-1\u001b[39;49;00m + \u001b[34m-16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) + \u001b[34m-1\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m) + n >= \u001b[34m0\u001b[39;49;00m)\n",
      "      b_fetch[lid(\u001b[34m0\u001b[39;49;00m)] = b[\u001b[34m16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)];\n",
      "    \u001b[34mif\u001b[39;49;00m (\u001b[34m-1\u001b[39;49;00m + \u001b[34m-16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + \u001b[34m-1\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m) + n >= \u001b[34m0\u001b[39;49;00m)\n",
      "      a_fetch[lid(\u001b[34m0\u001b[39;49;00m)] = a[\u001b[34m16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)];\n",
      "    \u001b[34mif\u001b[39;49;00m (\u001b[34m-1\u001b[39;49;00m + \u001b[34m-16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + \u001b[34m-1\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m) + n >= \u001b[34m0\u001b[39;49;00m)\n",
      "      c[n * (\u001b[34m16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)) + \u001b[34m16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) + lid(\u001b[34m1\u001b[39;49;00m)] = a_fetch[lid(\u001b[34m0\u001b[39;49;00m)] * b_fetch[lid(\u001b[34m1\u001b[39;49;00m)];\n",
      "  }\n",
      "}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "fetch_knl = split_knl\n",
    "\n",
    "fetch_knl = lp.add_prefetch(fetch_knl, \"a\", [\"i_inner\"], default_tag=\"l.auto\")\n",
    "fetch_knl = lp.add_prefetch(fetch_knl, \"b\", [\"j_inner\"], default_tag=\"l.auto\")\n",
    "\n",
    "fetch_knl = lp.add_inames_for_unused_hw_axes(fetch_knl, \"id:*fetch*\")\n",
    "evt, (mat,) = fetch_knl(queue, a=a, b=b)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Transforming kernels: Eliminating Conditionals\n",
    "\n",
    "All those conditionals take time to evaluate!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine lid(N) ((int) get_local_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\u001b[36m#\u001b[39;49;00m\u001b[36mdefine gid(N) ((int) get_group_id(N))\u001b[39;49;00m\u001b[36m\u001b[39;49;00m\n",
      "\n",
      "__kernel \u001b[36mvoid\u001b[39;49;00m __attribute__ ((reqd_work_group_size(\u001b[34m16\u001b[39;49;00m, \u001b[34m16\u001b[39;49;00m, \u001b[34m1\u001b[39;49;00m))) loopy_kernel(__global \u001b[36mfloat\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m *__restrict__ a, __global \u001b[36mfloat\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m *__restrict__ b, __global \u001b[36mfloat\u001b[39;49;00m *__restrict__ c, \u001b[36mint\u001b[39;49;00m \u001b[34mconst\u001b[39;49;00m n)\n",
      "{\n",
      "  \u001b[36mfloat\u001b[39;49;00m a_fetch[\u001b[34m16\u001b[39;49;00m];\n",
      "  \u001b[36mfloat\u001b[39;49;00m b_fetch[\u001b[34m16\u001b[39;49;00m];\n",
      "\n",
      "  \u001b[37m/* bulk slab for 'j_outer' */\u001b[39;49;00m\n",
      "  \u001b[37m/* bulk slab for 'i_outer' */\u001b[39;49;00m\n",
      "  \u001b[34mif\u001b[39;49;00m (\u001b[34m-17\u001b[39;49;00m + \u001b[34m-16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) + n >= \u001b[34m0\u001b[39;49;00m && \u001b[34m-17\u001b[39;49;00m + \u001b[34m-16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + n >= \u001b[34m0\u001b[39;49;00m)\n",
      "  {\n",
      "    b_fetch[lid(\u001b[34m0\u001b[39;49;00m)] = b[\u001b[34m16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)];\n",
      "    a_fetch[lid(\u001b[34m0\u001b[39;49;00m)] = a[\u001b[34m16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)];\n",
      "    c[n * (\u001b[34m16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)) + \u001b[34m16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) + lid(\u001b[34m1\u001b[39;49;00m)] = a_fetch[lid(\u001b[34m0\u001b[39;49;00m)] * b_fetch[lid(\u001b[34m1\u001b[39;49;00m)];\n",
      "  }\n",
      "  \u001b[37m/* final slab for 'i_outer' */\u001b[39;49;00m\n",
      "  \u001b[34mif\u001b[39;49;00m (\u001b[34m16\u001b[39;49;00m + \u001b[34m16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + \u001b[34m-1\u001b[39;49;00m * n >= \u001b[34m0\u001b[39;49;00m && \u001b[34m-17\u001b[39;49;00m + \u001b[34m-16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) + n >= \u001b[34m0\u001b[39;49;00m)\n",
      "  {\n",
      "    b_fetch[lid(\u001b[34m0\u001b[39;49;00m)] = b[\u001b[34m16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)];\n",
      "    \u001b[34mif\u001b[39;49;00m (\u001b[34m-1\u001b[39;49;00m + \u001b[34m-16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + \u001b[34m-1\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m) + n >= \u001b[34m0\u001b[39;49;00m)\n",
      "      a_fetch[lid(\u001b[34m0\u001b[39;49;00m)] = a[\u001b[34m16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)];\n",
      "    \u001b[34mif\u001b[39;49;00m (\u001b[34m-1\u001b[39;49;00m + \u001b[34m-16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + \u001b[34m-1\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m) + n >= \u001b[34m0\u001b[39;49;00m)\n",
      "      c[n * (\u001b[34m16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)) + \u001b[34m16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) + lid(\u001b[34m1\u001b[39;49;00m)] = a_fetch[lid(\u001b[34m0\u001b[39;49;00m)] * b_fetch[lid(\u001b[34m1\u001b[39;49;00m)];\n",
      "  }\n",
      "  \u001b[37m/* final slab for 'j_outer' */\u001b[39;49;00m\n",
      "  \u001b[37m/* bulk slab for 'i_outer' */\u001b[39;49;00m\n",
      "  \u001b[34mif\u001b[39;49;00m (\u001b[34m-1\u001b[39;49;00m + \u001b[34m-16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) + \u001b[34m-1\u001b[39;49;00m * lid(\u001b[34m1\u001b[39;49;00m) + n >= \u001b[34m0\u001b[39;49;00m && \u001b[34m-17\u001b[39;49;00m + \u001b[34m-16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + n >= \u001b[34m0\u001b[39;49;00m && \u001b[34m16\u001b[39;49;00m + \u001b[34m16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) + \u001b[34m-1\u001b[39;49;00m * n >= \u001b[34m0\u001b[39;49;00m)\n",
      "  {\n",
      "    \u001b[34mif\u001b[39;49;00m (\u001b[34m-1\u001b[39;49;00m + \u001b[34m-16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) + \u001b[34m-1\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m) + n >= \u001b[34m0\u001b[39;49;00m)\n",
      "      b_fetch[lid(\u001b[34m0\u001b[39;49;00m)] = b[\u001b[34m16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)];\n",
      "    a_fetch[lid(\u001b[34m0\u001b[39;49;00m)] = a[\u001b[34m16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)];\n",
      "    c[n * (\u001b[34m16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)) + \u001b[34m16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) + lid(\u001b[34m1\u001b[39;49;00m)] = a_fetch[lid(\u001b[34m0\u001b[39;49;00m)] * b_fetch[lid(\u001b[34m1\u001b[39;49;00m)];\n",
      "  }\n",
      "  \u001b[37m/* final slab for 'i_outer' */\u001b[39;49;00m\n",
      "  \u001b[34mif\u001b[39;49;00m (\u001b[34m-1\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + gid(\u001b[34m1\u001b[39;49;00m) == \u001b[34m0\u001b[39;49;00m && \u001b[34m16\u001b[39;49;00m + \u001b[34m16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + \u001b[34m-1\u001b[39;49;00m * n >= \u001b[34m0\u001b[39;49;00m && \u001b[34m-1\u001b[39;49;00m + \u001b[34m-16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + \u001b[34m-1\u001b[39;49;00m * lid(\u001b[34m1\u001b[39;49;00m) + n >= \u001b[34m0\u001b[39;49;00m)\n",
      "  {\n",
      "    \u001b[34mif\u001b[39;49;00m (\u001b[34m-1\u001b[39;49;00m + \u001b[34m-16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + \u001b[34m-1\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m) + n >= \u001b[34m0\u001b[39;49;00m)\n",
      "      b_fetch[lid(\u001b[34m0\u001b[39;49;00m)] = b[\u001b[34m16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)];\n",
      "    \u001b[34mif\u001b[39;49;00m (\u001b[34m-1\u001b[39;49;00m + \u001b[34m-16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + \u001b[34m-1\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m) + n >= \u001b[34m0\u001b[39;49;00m)\n",
      "      a_fetch[lid(\u001b[34m0\u001b[39;49;00m)] = a[\u001b[34m16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)];\n",
      "    \u001b[34mif\u001b[39;49;00m (\u001b[34m-1\u001b[39;49;00m + \u001b[34m-16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + \u001b[34m-1\u001b[39;49;00m * lid(\u001b[34m0\u001b[39;49;00m) + n >= \u001b[34m0\u001b[39;49;00m)\n",
      "      c[n * (\u001b[34m16\u001b[39;49;00m * gid(\u001b[34m0\u001b[39;49;00m) + lid(\u001b[34m0\u001b[39;49;00m)) + \u001b[34m16\u001b[39;49;00m * gid(\u001b[34m1\u001b[39;49;00m) + lid(\u001b[34m1\u001b[39;49;00m)] = a_fetch[lid(\u001b[34m0\u001b[39;49;00m)] * b_fetch[lid(\u001b[34m1\u001b[39;49;00m)];\n",
      "  }\n",
      "}\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sfetch_knl = knl\n",
    "sfetch_knl = lp.split_iname(sfetch_knl, \"i\", 16,\n",
    "        outer_tag=\"g.0\", inner_tag=\"l.0\", slabs=(0,1))\n",
    "sfetch_knl = lp.split_iname(sfetch_knl, \"j\", 16,\n",
    "        outer_tag=\"g.1\", inner_tag=\"l.1\", slabs=(0,1))\n",
    "\n",
    "sfetch_knl = lp.add_prefetch(sfetch_knl, \"a\", [\"i_inner\"], default_tag=\"l.auto\")\n",
    "sfetch_knl = lp.add_prefetch(sfetch_knl, \"b\", [\"j_inner\"], default_tag=\"l.auto\")\n",
    "sfetch_knl = lp.add_inames_for_unused_hw_axes(sfetch_knl, \"id:*fetch*\")\n",
    "\n",
    "evt, (mat,) = sfetch_knl(queue, a=a, b=b)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.1+"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
